Module 13 Elements of Regression Analysis

library(curl)
## Using libcurl 7.88.1 with LibreSSL/3.3.6
library(car)
## Loading required package: carData
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/zombies.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)

Analysis of Variance and ANOVA Tables

m <- lm(data = d, height ~ weight)
SSY <- sum((m$model$height - mean(m$model$height))^2)  # height - mean(height)
SSY
## [1] 18558.61
SSR <- sum((m$fitted.values - mean(m$model$height))^2)  # predicted height - mean height
SSR
## [1] 12864.82
SSE <- sum((m$model$height - m$fitted.values)^2)  # height - predicted height
SSE
## [1] 5693.785
df_regression <- 1
df_error <- 998
df_y <- 999
MSR <- SSR/df_regression
MSE <- SSE/df_error
MSY <- SSY/df_y

fratio <- MSR/MSE
fratio
## [1] 2254.931

Evaluate F ratio test statistic against F distribution

curve(df(x, df = 1, df2 = 1), col = "green", lty = 3, lwd = 2, xlim = c(0, 10),
    main = "Some Example F Distributions\n(vertical line shows critical value for df1=1,df2=998)",
    ylab = "f(x)", xlab = "x")
curve(df(x, df = 2, df2 = 2), col = "blue", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 4, df2 = 4), col = "red", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 8, df2 = 6), col = "purple", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 1, df2 = 998), col = "black", lwd = 3, add = TRUE)
legend("top", c("df1=1,df2=1", "df1=2,df2=2", "df1=4,df2=4", "df1=8,df2=6",
    "df1=1,df2=998"), lty = 3, lwd = 2, col = c("green", "blue", "red", "purple",
    "black"), bty = "n", cex = 0.75)

fcrit <- qf(p = 0.95, df1 = 1, df2 = 998)
fcrit
## [1] 3.850793
abline(v = fcrit)
abline(h = 0)
polygon(cbind(c(fcrit, seq(from = fcrit, to = 10, length.out = 1000), 8), c(0,
    df(seq(from = fcrit, to = 8, length.out = 1000), df1 = 1, df2 = 998), 0)),
    border = "black", col = "grey")

1 - pf(q = fratio, df1 = 1, df2 = 998)
## [1] 0
a <- aov(data = d, height ~ weight)
summary(a)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## weight        1  12865   12865    2255 <2e-16 ***
## Residuals   998   5694       6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(m)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## weight        1  12865   12865    2255 <2e-16 ***
## Residuals   998   5694       6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rsquared <- SSR/SSY
rsquared
## [1] 0.6931998
rho <- sqrt(rsquared)
rho
## [1] 0.8325862

Standard Errors of Regression Coefficients

SSX <- sum((m$model$weight - mean(m$model$weight))^2)
SEbeta1 <- sqrt(MSE/SSX)
SEbeta1
## [1] 0.004106858
SEbeta0 <- sqrt((MSE * sum(m$model$weight^2))/(1000 * SSX))
SEbeta0
## [1] 0.5958147
SEyhat <- sqrt(MSE * (1/1000 + (m$model$weight - mean(m$model$weight))^2/SSX))
head(SEyhat)
## [1] 0.08978724 0.07620966 0.08414480 0.09533986 0.08904151 0.08341218
summary(m)
## 
## Call:
## lm(formula = height ~ weight, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1519 -1.5206 -0.0535  1.5167  9.4439 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.565446   0.595815   66.41   <2e-16 ***
## weight       0.195019   0.004107   47.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.389 on 998 degrees of freedom
## Multiple R-squared:  0.6932, Adjusted R-squared:  0.6929 
## F-statistic:  2255 on 1 and 998 DF,  p-value: < 2.2e-16

Challenge 1

Calculate the residuals from the regression of zombie height on weight and plot these in relation to weight (the x variable).

m <- lm(data = d, height ~ weight)
plot(x = d$weight, y = m$residuals)

# or
e <- resid(m)
plot(x = d$weight, y = e)

Now, plot a histogram of your residuals… ideally they are normally distributed!

hist(e, xlim = c(-4 * sd(e), 4 * sd(e)), breaks = 20, main = "Histogram of Residuals")

plot(m$model$weight, m$residuals)

par(mfrow = c(2, 2))
plot(m)

qqnorm(m$residuals)

qqPlot(m$residuals)

## [1] 954 799

Tests for normality

s <- shapiro.test(m$residuals)
s
## 
##  Shapiro-Wilk normality test
## 
## data:  m$residuals
## W = 0.99713, p-value = 0.07041

Challenge 2

f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/KamilarAndCooperData.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)
head(d)
##               Scientific_Name          Family          Genus      Species
## 1 Allenopithecus_nigroviridis Cercopithecidae Allenopithecus nigroviridis
## 2         Allocebus_trichotis Cercopithecidae      Allocebus    trichotis
## 3           Alouatta_belzebul        Atelidae       Alouatta     belzebul
## 4             Alouatta_caraya        Atelidae       Alouatta       caraya
## 5            Alouatta_guariba        Atelidae       Alouatta      guariba
## 6           Alouatta_palliata        Atelidae       Alouatta     palliata
##   Brain_Size_Species_Mean Brain_Size_Female_Mean   Brain_size_Ref
## 1                   58.02                  53.70 Isler et al 2008
## 2                      NA                     NA                 
## 3                   52.84                  51.19 Isler et al 2008
## 4                   52.63                  47.80 Isler et al 2008
## 5                   51.70                  49.08 Isler et al 2008
## 6                   49.88                  48.04 Isler et al 2008
##   Body_mass_male_mean Body_mass_female_mean Mass_Dimorphism
## 1                6130                  3180           1.928
## 2                  92                    84           1.095
## 3                7270                  5520           1.317
## 4                6525                  4240           1.539
## 5                5800                  4550           1.275
## 6                7150                  5350           1.336
##                 Mass_Ref MeanGroupSize AdultMales AdultFemale AdultSexRatio
## 1       Isler et al 2008            NA         NA          NA            NA
## 2 Smith and Jungers 1997          1.00       1.00         1.0            NA
## 3       Isler et al 2008          7.00       1.00         1.0          1.00
## 4       Isler et al 2008          8.00       2.30         3.3          1.43
## 5       Isler et al 2008          6.53       1.37         2.2          1.61
## 6       Isler et al 2008         12.00       2.90         6.3          2.17
##                                                     Social_Organization_Ref
## 1                                                                          
## 2                                                             Kappeler 1997
## 3                                                       Campbell et al 2007
## 4 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## 5                                                       Campbell et al 2007
## 6 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
##   InterbirthInterval_d Gestation WeaningAge_d MaxLongevity_m LitterSz
## 1                   NA        NA       106.15          276.0     1.01
## 2                   NA        NA           NA             NA     1.00
## 3                   NA        NA           NA             NA       NA
## 4               337.62       187       323.16          243.6     1.01
## 5                   NA        NA           NA             NA       NA
## 6               684.37       186       495.60          300.0     1.02
##    Life_History_Ref GR_MidRangeLat_dd Precip_Mean_mm Temp_Mean_degC AET_Mean_mm
## 1 Jones et al. 2009             -0.17         1574.0           25.2      1517.8
## 2                              -16.59         1902.3           20.3      1388.2
## 3                               -6.80         1643.5           24.9      1286.6
## 4 Jones et al. 2009            -20.34         1166.4           22.9      1193.1
## 5                              -21.13         1332.3           19.6      1225.7
## 6 Jones et al. 2009              6.95         1852.6           23.7      1300.0
##   PET_Mean_mm       Climate_Ref HomeRange_km2      HomeRangeRef DayLength_km
## 1      1589.4 Jones et al. 2009            NA                             NA
## 2      1653.7 Jones et al. 2009            NA                             NA
## 3      1549.8 Jones et al. 2009            NA                             NA
## 4      1404.9 Jones et al. 2009            NA                           0.40
## 5      1332.2 Jones et al. 2009          0.03 Jones et al. 2009           NA
## 6      1633.9 Jones et al. 2009          0.19 Jones et al. 2009         0.32
##       DayLengthRef Territoriality Fruit Leaves Fauna             DietRef1
## 1                              NA    NA                                  
## 2                              NA    NA                                  
## 3                              NA  57.3   19.1   0.0 Campbell et al. 2007
## 4 Nunn et al. 2003             NA  23.8   67.7   0.0 Campbell et al. 2007
## 5                              NA   5.2   73.0   0.0 Campbell et al. 2007
## 6 Nunn et al. 2003         0.6506  33.1   56.4   0.0 Campbell et al. 2007
##   Canine_Dimorphism Canine_Dimorphism_Ref  Feed  Move  Rest Social
## 1             2.210   Plavcan & Ruff 2008    NA    NA    NA     NA
## 2                NA                          NA    NA    NA     NA
## 3             1.811   Plavcan & Ruff 2008 13.75 18.75 57.30  10.00
## 4             1.542   Plavcan & Ruff 2008 15.90 17.60 61.60   4.90
## 5             1.783   Plavcan & Ruff 2008 18.33 14.33 64.37   3.00
## 6             1.703   Plavcan & Ruff 2008 17.94 12.32 66.14   3.64
##    Activity_Budget_Ref
## 1                     
## 2                     
## 3 Campbell et al. 2007
## 4 Campbell et al. 2007
## 5 Campbell et al. 2007
## 6 Campbell et al. 2007
plot(data = d, WeaningAge_d ~ Body_mass_female_mean)

model <- lm(data = d, WeaningAge_d ~ Body_mass_female_mean)
summary(model)
## 
## Call:
## lm(formula = WeaningAge_d ~ Body_mass_female_mean, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -720.39 -115.48  -54.05   58.11  471.22 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.016e+02  2.012e+01   10.02   <2e-16 ***
## Body_mass_female_mean 2.013e-02  1.927e-03   10.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183.4 on 114 degrees of freedom
##   (97 observations deleted due to missingness)
## Multiple R-squared:  0.489,  Adjusted R-squared:  0.4845 
## F-statistic: 109.1 on 1 and 114 DF,  p-value: < 2.2e-16
plot(model)

qqPlot(model$residuals)

## 97 44 
## 48 24
# Test for normality
s <- shapiro.test(model$residuals)
s
## 
##  Shapiro-Wilk normality test
## 
## data:  model$residuals
## W = 0.86291, p-value = 5.825e-09

Data Transformations

Challenge 3

d$logWeaningAge <- log(d$WeaningAge_d)
d$logFemaleBodyMass <- log(d$Body_mass_female_mean)
plot(data = d, logWeaningAge ~ logFemaleBodyMass)

model <- lm(data = d, logWeaningAge ~ logFemaleBodyMass)
summary(model)
## 
## Call:
## lm(formula = logWeaningAge ~ logFemaleBodyMass, data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10639 -0.32736  0.00848  0.32214  1.11010 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.7590     0.2196   8.011 1.08e-12 ***
## logFemaleBodyMass   0.4721     0.0278  16.983  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4532 on 114 degrees of freedom
##   (97 observations deleted due to missingness)
## Multiple R-squared:  0.7167, Adjusted R-squared:  0.7142 
## F-statistic: 288.4 on 1 and 114 DF,  p-value: < 2.2e-16
qqPlot(model$residuals)

##  44 213 
##  24 116
s <- shapiro.test(model$residuals)
s
## 
##  Shapiro-Wilk normality test
## 
## data:  model$residuals
## W = 0.99367, p-value = 0.8793
par(mfrow = c(1, 2))

a <- 2
b <- 2

# log x
x <- seq(from = 0, to = 100, length.out = 1000)
y <- a + b * log(x)
plot(x, y, type = "l", main = "untransformed")
plot(log(x), y, type = "l", main = "log(x)")

# log y
x <- seq(from = 0, to = 10, length.out = 1000)
y <- exp(a + b * x)
plot(x, y, type = "l", main = "untransformed")

plot(x, log(y), type = "l", main = "log(y)")

# assymptotic
x <- seq(from = 1, to = 100, length.out = 100)
y <- (a * x)/(1 + b * x)
plot(x, y, type = "l", main = "untransformed")

plot(1/x, y, type = "l", main = "1/x")

# reciprocal
x <- seq(from = 1, to = 100, length.out = 100)
y <- a + b/x
plot(x, y, type = "l", main = "untransformed")

plot(1/x, y, type = "l", main = "1/x")

# power
x <- seq(from = 1, to = 100, length.out = 100)
y <- a * x^b
plot(x, y, type = "l", main = "untransformed")

plot(x^b, y, type = "l", main = "x^b")

# exp
x <- seq(from = 1, to = 10, length.out = 100)
y <- a * exp(b * x)
plot(x, y, type = "l", main = "untransformed")

plot(x, log(y), type = "l", main = "log(y)")